# -*- coding: utf-8 -*-
"""
------------------------------------------------------------------------------
    File Name:  BaikeHtmlParser
    Author   :  wanwei1029
    Date     :  2018/5/17
    Desc     :
------------------------------------------------------------------------------
"""
import re
import urllib.parse as urlparse
from bs4 import BeautifulSoup


class BaikeHtmlParser(object):
    """Extract follow-up links and basic entry data from a fetched HTML page.

    The page is parsed with BeautifulSoup (``lxml`` backend). Links whose
    ``href`` starts with ``/item/`` are collected as new URLs, and the entry
    title and first summary paragraph are collected as page data.
    """

    # Compiled once; matches relative links that point at other entries.
    _ITEM_HREF = re.compile(r"^/item/")

    def parser(self, page_url, html_cont):
        """Parse one page and return its outgoing links and extracted data.

        Args:
            page_url: URL the content was fetched from; used to resolve
                relative links and stored in the returned data.
            html_cont: Page markup, either ``str`` or raw bytes.

        Returns:
            ``None`` when either argument is ``None``; otherwise a tuple
            ``(new_urls, new_data)`` where ``new_urls`` is a set of absolute
            URLs and ``new_data`` is a dict with keys ``url``, ``title`` and
            ``summary``.
        """
        if page_url is None or html_cont is None:
            return None

        # BeautifulSoup ignores ``from_encoding`` for already-decoded text and
        # emits a warning, so the hint is only passed for undecoded input.
        soup_kwargs = {}
        if not isinstance(html_cont, str):
            soup_kwargs["from_encoding"] = 'utf-8'
        soup = BeautifulSoup(html_cont, 'lxml', **soup_kwargs)

        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs for every ``/item/`` link in *soup*."""
        anchors = soup.find_all('a', href=self._ITEM_HREF)
        return {urlparse.urljoin(page_url, anchor["href"]) for anchor in anchors}

    def _get_new_data(self, page_url, soup):
        """Return a dict holding the page URL, its title and its summary text.

        A field whose node cannot be found in the page is stored as an empty
        string instead of raising ``AttributeError`` on ``None``.
        """
        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        heading = title_box.find("h1") if title_box is not None else None
        summary = soup.find('div', class_="para", attrs={"label-module": "para"})

        return {
            "url": page_url,
            "title": heading.get_text() if heading is not None else "",
            "summary": summary.get_text() if summary is not None else "",
        }
